SE object created from exploratory analysis.
Gene selection according to Biotype already performed
SE_Bio <- readRDS(paste0(params$SE_Bio))Duplicated gene names are dropped and gene IDs are set as rownames.
The number of duplicated GeneName is: 11899
The number of duplicated Ensembl Genes with version is: 0
SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment
## dim: 24708 36
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')
padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_thresholdOnly day25 Cortical Brain Organoids are kept
SE_Bio_d25CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd25']
colData(SE_Bio_d25CbO)
## DataFrame with 13 rows and 11 columns
## SampleNumber SampleID_GU SampleID_OG SampleName SampleType Genotype
## <integer> <character> <character> <character> <character> <character>
## OLE_007 5 D25_2_WT D25_2_WT ORG_CHD2_WT_round2A_.. CorticalOrganoids WT
## OLE_008 6 D25_2_HT D25_2_HT ORG_CHD2_HT_round2A_.. CorticalOrganoids HT
## OLE_013 9 D25_3b_WT D25_3b_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids WT
## OLE_014 10 D25_3b_HT D25_3b_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids HT
## OLE_031 18 OLE_031 OL06 6_Evo1_d25_WT3 CorticalOrganoids WT
## ... ... ... ... ... ... ...
## OLE_035 22 OLE_035 OL10 10_Evo3_d25_WTC_1-2 CorticalOrganoids WT
## OLE_036 23 OLE_036 OL11 11_Evo3_d25_A5_1-2 CorticalOrganoids AR
## OLE_037 24 OLE_037 OL12 12_Evo4_d25_WTC-A CorticalOrganoids WT
## OLE_038 25 OLE_038 OL13 13_Evo4_d25_A5-A CorticalOrganoids AR
## OLE_039 26 OLE_039 OL14 14_Evo4_d25_HET-A CorticalOrganoids HT
## Timepoint Batch SeqRun HRID lib.size
## <character> <character> <integer> <character> <numeric>
## OLE_007 d25 Round2 1 OLE_007 17223455
## OLE_008 d25 Round2 1 OLE_008 18879578
## OLE_013 d25 Round3 1 OLE_013 20419528
## OLE_014 d25 Round3 1 OLE_014 41252167
## OLE_031 d25 Evo1 2 OLE_031 24061399
## ... ... ... ... ... ...
## OLE_035 d25 Evo3 2 OLE_035 18643755
## OLE_036 d25 Evo3 2 OLE_036 21633783
## OLE_037 d25 Evo4 2 OLE_037 22432181
## OLE_038 d25 Evo4 2 OLE_038 20928321
## OLE_039 d25 Evo4 2 OLE_039 19103731CountMatrix <- assays(SE_Bio_d25CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d25CbO))
all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUEdds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d25CbO)$counts, DataFrame(colData(SE_Bio_d25CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors
mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d25CbO)))
dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")
dds$Genotype
## [1] WT HT WT HT WT AR HT HT WT AR WT AR HT
## Levels: WT AR HT
dds
## class: DESeqDataSet
## dim: 24708 13
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(13): OLE_007 OLE_008 ... OLE_038 OLE_039
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeZeroPlot <- CountMatrix %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = "col", values_to = "value") %>%
filter(value == 0) %>%
group_by(col) %>%
summarise(zerocount = n()) %>%
ggplot(., aes(y=col, x=zerocount)) +
geom_col(col='black', fill='#76c8c8') +
coord_flip() +
geom_label(aes(label=zerocount)) +
labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
y='', x='') +
theme_bw() +
theme(axis.text = element_text(colour = 'black', size=7),
axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
plot.title = element_text(hjust = 0.5, size = 7))
ZeroPlotkeep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3
table(keep)
## keep
## FALSE TRUE
## 6927 17781
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 17781 13
## metadata(1): version
## assays(1): counts
## rownames(17781): TSPAN6 TNMD ... PDCD6-AHRR LNCDAT
## rowData names(9): Gene EnsGene ... Start End
## colnames(13): OLE_007 OLE_008 ... OLE_038 OLE_039
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeA dds object containing information about 17781 genes in 13 samples is tested for differential expression.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
## OLE_007 OLE_008 OLE_013 OLE_014 OLE_031 OLE_032 OLE_033 OLE_034 OLE_035
## 0.7693623 0.8724444 0.9194546 1.8788253 1.1027455 1.0629631 0.9634343 1.0544527 0.8574659
## OLE_036 OLE_037 OLE_038 OLE_039
## 0.9944899 1.0366067 0.9750425 0.8666412select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))
rownames(df) <- colnames(ntd)
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=TRUE, annotation_col=df, border_color = 'black')vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors, main = 'Samples Distances', name = 'vst')SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))
par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
plotDensity(log2(counts(dds)), col=ScaledCols,
xlab="log2(counts)", cex.lab=0.7, panel.first=grid())
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols,
xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) plotDispEsts(dds)res_dds_ar <- results(dds, contrast=c("Genotype","AR","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ar)
##
## out of 17781 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 81, 0.46%
## LFC < 0 (down) : 90, 0.51%
## outliers [1] : 0, 0%
## low counts [2] : 1379, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
metadata(res_dds_ar)$filterThreshold
## 7.755102%
## 5.155949
metadata(res_dds_ar)$alpha
## [1] 0.05
plot(metadata(res_dds_ar)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter", main='Ancestral',
cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ar)$lo.fit, col="red")
abline(v=metadata(res_dds_ar)$filterTheta)head(res_dds_ar[order(res_dds_ar$padj),])
## log2 fold change (MLE): Genotype AR vs WT
## Wald test p-value: Genotype AR vs WT
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## PCDHA2 172.146 -3.63859 0.335444 -10.84706 2.05951e-27 2.81403e-23
## LINC01515 195.870 -5.03618 0.466301 -10.80029 3.43132e-27 2.81403e-23
## SLC15A4 124.413 -4.02533 0.397896 -10.11655 4.66586e-24 2.55098e-20
## FZD10 1452.125 -5.87179 0.620686 -9.46017 3.07443e-21 1.26067e-17
## ZNF100 556.842 -1.45890 0.156480 -9.32320 1.12888e-20 3.70317e-17
## UBE2T 701.970 -1.01053 0.113299 -8.91919 4.69711e-19 1.28403e-15Genes modulated considering a FDR threshold of 0.1: 268
Genes modulated considering a FDR threshold of 0.05: 171
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 111
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 74
Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.
resAshr_ar <- lfcShrink(dds, contrast=c("Genotype","AR","WT"), res=res_dds_ar, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ar)
##
## out of 17781 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 81, 0.46%
## LFC < 0 (down) : 90, 0.51%
## outliers [1] : 0, 0%
## low counts [2] : 1379, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ar, xlim=xlim, ylim=ylim, main="no LFC shrink")DESeq2::plotMA(resAshr_ar, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")head(resAshr_ar[order(resAshr_ar$padj),])
## log2 fold change (MMSE): Genotype AR vs WT
## Wald test p-value: Genotype AR vs WT
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## PCDHA2 172.146 -3.602244 0.334754 2.05951e-27 2.81403e-23
## LINC01515 195.870 -4.945350 0.462591 3.43132e-27 2.81403e-23
## SLC15A4 124.413 -3.970674 0.395832 4.66586e-24 2.55098e-20
## FZD10 1452.125 -5.691165 0.613107 3.07443e-21 1.26067e-17
## ZNF100 556.842 -1.395251 0.157343 1.12888e-20 3.70317e-17
## UBE2T 701.970 -0.980601 0.112123 4.69711e-19 1.28403e-15Genes modulated considering a FDR threshold of 0.1: 268
Genes modulated considering a FDR threshold of 0.05: 171
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 80
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 40
Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.
deseqDEGsAR <- dplyr::filter(data.frame(res_dds_ar), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))
deseqDEGsARashr <- dplyr::filter(data.frame(resAshr_ar), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))FdrCeil = 1e-10
logFcCeil = 7.5#rename(as.data.frame(res_dds_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'AR vs WT')
dplyr::rename(as.data.frame(res_dds_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'AR vs WT (day25 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)#rename(as.data.frame(res_dds_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'AR vs WT')
dplyr::rename(as.data.frame(resAshr_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'AR vs WT (day25 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)DEAList <- list()
DEAList <- list(dds = dds, #same for both
AR = list(res = res_dds_ar,
DEGs = deseqDEGsAR,
resAshr =resAshr_ar,
DEGsAshr = deseqDEGsARashr))# RDS
saveRDS(DEAList, paste0(SavingFolder, '/day25CbO.', 'DEAList_AR.rds'))
saveRDS(SE_deseq2, paste0(SavingFolder, '/day25CbO.', 'SE_deseq2_AR.rds'))
saveRDS(res_dds_ar, paste0(SavingFolder, '/day25CbO.', 'deseqARvsWT.rds'))
SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/day25CbO.', 'DEAnalysisWorkspace_AR.RData'))